{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: Character N-Gram Jaccard Index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate Jaccard similarities between sets of character $n$-grams for different values of $n$."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_list_id = 'jaccard_ngrams'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Range of $n$ to try for the $n$-grams."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "NGRAM_RANGE = range(2, 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessed and tokenized questions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle')\n",
    "tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens = tokens_train + tokens_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Character $n$-gram similarities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_char_ngrams(doc, n):\n",
    "    return [doc[i:i + n] for i in range(len(doc) - n + 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jaccard_set_similarities(a, b):\n",
    "    len_intersection = len(a.intersection(b))\n",
    "    jaccard_index = len_intersection / len(a.union(b))\n",
    "    jaccard_index_norm_a = len_intersection / len(a)\n",
    "    jaccard_index_norm_b = len_intersection / len(b)\n",
    "    \n",
    "    return jaccard_index, jaccard_index_norm_a, jaccard_index_norm_b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_jaccard_similarities(q1, q2, n):\n",
    "    if len(q1) < max(NGRAM_RANGE) and len(q2) < max(NGRAM_RANGE):\n",
    "        return 1, 1, 1\n",
    "    if len(q1) < max(NGRAM_RANGE) or len(q2) < max(NGRAM_RANGE):\n",
    "        return 0, 0, 0\n",
    "    \n",
    "    q1_ngrams = set(get_char_ngrams(q1, n))\n",
    "    q2_ngrams = set(get_char_ngrams(q2, n))\n",
    "    return get_jaccard_set_similarities(q1_ngrams, q2_ngrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_question_pair_features(pair):\n",
    "    q1 = ' '.join(pair[0])\n",
    "    q2 = ' '.join(pair[1])\n",
    "    \n",
    "    features = []\n",
    "    for n in NGRAM_RANGE:\n",
    "        features.extend(get_jaccard_similarities(q1, q2, n))\n",
    "    \n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 2751/2751 [11:25<00:00,  4.01it/s]\n"
     ]
    }
   ],
   "source": [
    "features = kg.jobs.map_batch_parallel(\n",
    "    tokens,\n",
    "    item_mapper=get_question_pair_features,\n",
    "    batch_size=1000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "for n in NGRAM_RANGE:\n",
    "    feature_names.append(f'jaccard_ix_{n}gram')\n",
    "    feature_names.append(f'jaccard_ix_norm_q1_{n}gram')\n",
    "    feature_names.append(f'jaccard_ix_norm_q2_{n}gram')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pairwise similarity differences for $n$ and $n+1$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(features, columns=feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "for n in NGRAM_RANGE[:-1]:\n",
    "    m = n + 1\n",
    "    diff_feature_name = f'jaccard_ix_diff_{n}_{m}'\n",
    "    df[diff_feature_name]= np.abs(df[f'jaccard_ix_{n}gram'] - df[f'jaccard_ix_{m}gram'])\n",
    "    feature_names.append(diff_feature_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build final features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = np.array(df.values[:len(tokens_train)], dtype='float64')\n",
    "X_test = np.array(df.values[len(tokens_train):], dtype='float64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train: (404290, 15)\n",
      "X_test:  (2345796, 15)\n"
     ]
    }
   ],
   "source": [
    "print('X_train:', X_train.shape)\n",
    "print('X_test: ', X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "project.save_features(X_train, X_test, feature_names, feature_list_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
